1 Analysis of 1000 genomes genotypes and MyHeritage data utilizing ancestry informative SNPs from Kidd et al. and Seldin et al.

The main idea is taken from Kevin Arvai’s repo using a Python notebook.

1.1 Load data

1.2 Plot 1K Genomes Project data using Kid et al. ancestry informative SNPs (AISNPs)

1.2.1 Formatting SNP data

df = as.data.frame(kid_vcf_file@gt) |> 
  dplyr::select(rownames(df_samples)) |> 
  mutate(rs_id=getFIX(kid_vcf_file)[,3]) |> 
  column_to_rownames("rs_id") |> 
  t() |> 
  as.data.frame() 

df[df=="0|0"]<-0
df[df=="0|1" | df=="1|0"]<-1
df[df=="1|1"]<-3

dim(df)
## [1] 2504   55
nrow(df) == nrow(df_samples) 
## [1] TRUE

1.2.2 Performing dimensionality reduction with Factor Analysis of Mixed Data (FAMD) and plotting

famd <- FAMD(df, graph=FALSE)
plot <- fviz_pca_ind(famd)
fig <- plot_ly(data = plot$data |> 
         left_join(df_samples |> 
                     rownames_to_column("name"),
                   by="name"), 
               x = ~x, 
               y = ~y,
               color=~super_pop)
fig

1.3 Plot 1K Genomes Project and my own MyHeritage data using Kid et al. ancestry informative SNPs (AISNPs)

1.3.1 Formatting SNP data

# kid_vcf_file@fix |> 
#   as.data.frame() |> 
#   dplyr::select(c(ID, REF, ALT))

myheritage_raw_sub <- myheritage_raw |> 
  filter(RSID %in% colnames(df)) |> 
  left_join(kid_vcf_file@fix |> 
              as.data.frame() |> 
              dplyr::select(c(ID, REF, ALT)),
            by=c("RSID"="ID")) |> 
  mutate(ALLELE1=substr(RESULT, 1, 1),
         ALLELE2=substr(RESULT, 2, 2),
         gt=ifelse(ALLELE1 == REF & 
                     ALLELE2 == REF,
                   0,
                   ifelse((ALLELE1 == REF & ALLELE2 == ALT) |
                            (ALLELE2 == REF & ALLELE1 == ALT),
                          1,
                          ifelse(ALLELE1 == ALT &
                                   ALLELE2 == ALT, 3,
                                 2)))) 
# table(myheritage_raw_sub$gt)
# 7 SNPs are missing from my data (could be imputed)

df_updated <- df |> 
  t() |> 
  as.data.frame() |> 
  rownames_to_column("RSID") |> 
  #filter(RSID %in% myheritage_raw_sub$RSID) |> 
  left_join(myheritage_raw_sub |> 
              dplyr::select(c(RSID, gt)),
            by="RSID") |> 
  column_to_rownames("RSID") |> 
  mutate_all(function(x) as.numeric(x)) |> 
  rowwise() %>%
  mutate(gt=ifelse(is.na(gt), 
                   which.max(table(c_across())),
                   gt), .) |> 
  mutate_all(function(x) as.character(x)) |> 
  t()

1.3.2 Performing dimensionality reduction with Factor Analysis of Mixed Data (FAMD) and plotting

famd <- FAMD(df_updated, graph=FALSE)
plot <- fviz_pca_ind(famd)
plot_data <- plot$data |> 
  left_join(df_samples |> 
              rownames_to_column("name"),
            by="name") %>% 
  mutate(pop=ifelse(name=="gt", "Me", pop),
         super_pop=ifelse(name=="gt", "Me", super_pop),
         symb=ifelse(name=="gt", "Me", "Other"))

fig <- plot_ly(data = plot_data, 
               x = ~x, 
               y = ~y,
               color=~super_pop,
               symbol = ~symb, 
               symbols = c("x", "circle"),
               size=ifelse(plot_data$name=="gt", 8, 3))
fig

1.4 Eucledian distance

euclidian = function(
    mat1,       # Matrix with observations in COLUMNS.
    mat2=mat1   # Matrix with observations in COLUMNS.
) {
  apply(mat1, 2, function(yi) sqrt(colSums((mat2 - yi)^2)))
}

d <- euclidian(t(plot_data[plot_data$name=="gt",2:3]),
               t(plot_data[!plot_data$name=="gt",2:3])) |> 
  as.data.frame() |> 
  dplyr::rename("euc_dist"="2505") |> 
  mutate(sample=plot_data$name[plot_data$name!="gt"]) |> 
  arrange(euc_dist) |> 
  left_join(df_samples |> 
              rownames_to_column("sample"),
            by="sample")